import pandas as pd
import numpy as np
import math
import sqlite3
import os
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
from sklearn.impute import KNNImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder
from pandas_profiling import ProfileReport
from sklearn.cluster import MeanShift, DBSCAN, estimate_bandwidth
from collections import Counter
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.neighbors import NearestNeighbors
sns.set()
from os.path import join
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.cm as cm
from sklearn.metrics import silhouette_score, silhouette_samples
from sklearn.cluster import KMeans
from sklearn.base import clone
from sklearn.manifold import TSNE
import sompy
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView
from sompy.visualization.hitmap import HitMapView
from collections import Counter
from os.path import join
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd
from sklearn.cluster import DBSCAN, KMeans, AgglomerativeClustering
from sklearn.base import clone
from sklearn.metrics import pairwise_distances
from scipy.cluster.hierarchy import dendrogram
from sklearn.manifold import TSNE
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.model_selection import train_test_split
import graphviz
sns.set()
from os.path import join
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans, AgglomerativeClustering
from sklearn.neighbors import KNeighborsClassifier
import sompy
from sompy.visualization.mapview import View2D
from sompy.visualization.bmuhits import BmuHitsView
from sompy.visualization.hitmap import HitMapView
# Load the donors dataset; 'Unnamed: 0' is the index column written out by a previous export.
data = pd.read_csv('donors.csv').drop(columns=['Unnamed: 0'])
# Percentage of missing values per column (used to decide which columns to drop).
percent_missing = data.isnull().sum() * 100 / len(data)
list(percent_missing )
# Drop code/identifier columns the authors judged not useful for the segmentation.
data=data.drop(columns=['OSOURCE','MAILCODE','ZIP','PVASTATE','NOEXCH','RECINHSE','RECP3','RECPGVG','RECSWEEP',"MDMAUD",\
"HOMEOWNR",'DATASRCE','SOLP3','SOLIH','MAJOR','GEOCODE','LIFESRC','PEPSTRFL','HPHONE_D','RFA_2R',\
'MDMAUD_R','MDMAUD_F','MDMAUD_A','GEOCODE2'])
# Binarize a mail-order response counter.
def donor_resp_maill_0_1(v1):
    """Return 1 if the donor responded to at least one mail order offer (v1 > 0), else 0."""
    return 1 if v1 > 0 else 0
# Applying the function above to all the variables referent to the number of mail order
# offers responded.
# BUG FIX: "MAGFEM_0_1" was previously computed from the "MAGFAML" column instead of
# "MAGFEM"; the loop below derives every *_0_1 flag from its own source column.
mail_order_columns = ["MBCRAFT", "MBGARDEN", "MBBOOKS", "MBCOLECT", "MAGFAML", "MAGFEM",
                      "MAGMALE", "PUBGARDN", "PUBCULIN", "PUBHLTH", "PUBDOITY",
                      "PUBNEWFN", "PUBPHOTO", "PUBOPP"]
data["hit_0_1"] = data["HIT"].apply(donor_resp_maill_0_1)
for col in mail_order_columns:
    data[col + "_0_1"] = data[col].apply(donor_resp_maill_0_1)
# Creating new variable "other_types_0_1" which aggregates all the flags above;
# 0 indicates the donor responded to none of the "other types" mail offers.
data["other_types_0_1"] = sum(data[col + "_0_1"] for col in mail_order_columns)
# If the sum is equal to 0 means that the donor did not respond any email for all the other types
# If the sum is >0 means that the donor responded at least one email for other types
# Classify the combination of the "hit" flag and the "other types" flag.
def donor_resp_maill_y_n(v1, v2):
    """Return which kind(s) of mail order offers the donor responded to.

    v1 is the binary "hit" indicator, v2 the binary "other types" indicator.
    Possible results: "None", "hit", "other types", "both".
    """
    if (v1 == 0) and (v2 == 0):
        return "None"
    if (v1 > 0) and (v2 == 0):
        return "hit"
    if (v1 == 0) and (v2 > 0):
        return "other types"
    return "both"
# Applying donor_resp_maill_y_n row-wise and storing the label in the new
# variable "donor_resp_maill_offer".
data["donor_resp_maill_offer"] = data.apply(
    lambda row: donor_resp_maill_y_n(row["hit_0_1"], row["other_types_0_1"]), axis=1
)
data["donor_resp_maill_offer"].value_counts()
# The raw counters and the intermediate binary helpers are no longer needed.
raw_mail_columns = ["HIT", "MBCRAFT", "MBGARDEN", "MBBOOKS", "MBCOLECT", "MAGFAML",
                    "MAGFEM", "MAGMALE", "PUBGARDN", "PUBCULIN", "PUBHLTH", "PUBDOITY",
                    "PUBNEWFN", "PUBPHOTO", "PUBOPP"]
binary_mail_columns = ["hit_0_1", "MBCRAFT_0_1", "MBGARDEN_0_1", "MBBOOKS_0_1",
                       "MBCOLECT_0_1", "MAGFAML_0_1", "MAGFEM_0_1", "MAGMALE_0_1",
                       "PUBGARDN_0_1", "PUBCULIN_0_1", "PUBHLTH_0_1", "PUBDOITY_0_1",
                       "PUBNEWFN_0_1", "PUBPHOTO_0_1", "PUBOPP_0_1", "other_types_0_1"]
data = data.drop(columns=raw_mail_columns)
data = data.drop(columns=binary_mail_columns)
Rule applied for the variable: If the donor has responded “yes” to at least one of the objects, we consider this donor as having an interest in collecting things from a third party. On the other hand, if he responded “no” for all the objects, the donor does not have an interest in collecting objects from third-party data sources. Classification: Yes: all the donors with interest; No: all the donors with no interest.
# Binary interest flag: a blank space in the source column means "no interest".
def donor_interest(v1):
    """Return 0 when the value is a blank space (no interest), otherwise 1."""
    return 0 if v1 == ' ' else 1
# Applying the function above to each collecting-interest flag column.
interest_columns = ["COLLECT1", "VETERANS", "BIBLE", "CATLG", "HOMEE", "PETS",
                    "CDPLAY", "STEREO", "PCOWNERS", "PHOTO", "CRAFTS", "FISHER",
                    "GARDENIN", "BOATS", "WALKER", "KIDSTUFF", "CARDS", "PLATES"]
for col in interest_columns:
    data[col + "_0_1"] = data[col].apply(donor_interest)
# Aggregate all the binary flags; 0 indicates the donor has no interest in
# collecting any object at all.
data["Pre_donor_interest_collecting"] = sum(data[col + "_0_1"] for col in interest_columns)
# Translate the aggregated interest counter into a yes/no label.
def donor_interest_y_n(v2):
    """Return "no" when the aggregated counter is 0, otherwise "yes"."""
    return "no" if v2 == 0 else "yes"
# Applying the function above on the Pre_donor_interest_collecting and creating the variable donor_interest_collecting.
data["donor_interest_collecting"]= data ["Pre_donor_interest_collecting"].apply(lambda x:donor_interest_y_n(x))
data["donor_interest_collecting"].value_counts()
# Drop the original interest columns and the intermediate binary helpers; only the
# aggregated "donor_interest_collecting" label is kept.
data=data.drop(columns=["COLLECT1","VETERANS", "BIBLE","CATLG","HOMEE","PETS","CDPLAY","STEREO","PCOWNERS","PHOTO",\
"CRAFTS","FISHER","GARDENIN","BOATS", "WALKER" ,"KIDSTUFF", "CARDS", "PLATES"])
data=data.drop(columns=["COLLECT1_0_1", "VETERANS_0_1", "BIBLE_0_1","CATLG_0_1", "HOMEE_0_1", "PETS_0_1", "CDPLAY_0_1",\
"STEREO_0_1", "PCOWNERS_0_1", "PHOTO_0_1", "CRAFTS_0_1", "FISHER_0_1", "GARDENIN_0_1",\
"BOATS_0_1", "WALKER_0_1", "KIDSTUFF_0_1", "CARDS_0_1", "PLATES_0_1"])
Rule applied for the variable: If the donor has responded “B”, “F”, or “M” to one of the questions “CHILD03, CHILD07, CHILD12, CHILD18”, or a number different from “0” for “NUMCHLD”, we consider that the donor has a child. On the other hand, if the donor has no response for “CHILD03, CHILD07, CHILD12, CHILD18” and “0” for “NUMCHLD”, we consider that the donor does not have a child. Classification: Yes: all the donors who have a child; No: all the donors who do not have a child.
# Child indicator: a blank space in the CHILD* columns means no child reported.
def donor_child(v1):
    """Return 0 when the value is a blank space (no child), otherwise 1."""
    return 0 if v1 == ' ' else 1
# Applying the function above on the variables "CHILD03", "CHILD07", "CHILD12", "CHILD18".
# Each *_0_1 column flags whether the corresponding age bracket reports a child.
data["CHILD03_0_1"]= data ["CHILD03"].apply(lambda x:donor_child(x))
data["CHILD07_0_1"]= data ["CHILD07"].apply(lambda x:donor_child(x))
data["CHILD12_0_1"]= data ["CHILD12"].apply(lambda x:donor_child(x))
data["CHILD18_0_1"]= data ["CHILD18"].apply(lambda x:donor_child(x))
# Creating a function that receives a dataframe and a numeric column and adds a new
# column "<column>binary" with binary values (0, 1): 0 indicates the value was missing
# (no child reported), 1 indicates a value is present (at least one child).
def binary_values_numchild(data_frame, column):
    """Add a column named column + "binary" flagging non-missing values of `column`.

    Parameters
    ----------
    data_frame : pandas.DataFrame — modified in place.
    column : str — name of the numeric column to binarize (NaN -> 0, any value -> 1).
    """
    # Vectorized replacement for the original per-row math.isnan() loop
    # (which also used the `is True` anti-pattern).
    data_frame[column + "binary"] = data_frame[column].notna().astype(int)
# Create NUMCHLDbinary (0 = NUMCHLD missing, 1 = at least one child recorded).
binary_values_numchild(data, "NUMCHLD")
data['NUMCHLDbinary'].value_counts()
# (repeated notebook cell — same inspection as the line above)
data['NUMCHLDbinary'].value_counts()
# Creating a function that receives a dataframe and 5 binary columns and adds the
# column 'donor_have_child_y_n' with binary values (0, 1): 0 indicates no child across
# all five indicators, 1 indicates at least one positive indicator.
def binary_values_(data_frame, column1, column2, column3, column4, column5):
    """Flag rows where any of the five binary indicator columns is non-zero.

    The result is stored in place in data_frame['donor_have_child_y_n'].
    """
    # Vectorized row-wise total. The original looped with positional indexing
    # (data_frame[col][i]), which breaks on a non-default index; this form does not.
    total = (data_frame[column1] + data_frame[column2] + data_frame[column3]
             + data_frame[column4] + data_frame[column5])
    data_frame['donor_have_child_y_n'] = (total != 0).astype(int)
# Applying the function above on the variables "CHILD03_0_1", "CHILD07_0_1", "CHILD12_0_1", "CHILD18_0_1", "NUMCHLDbinary".
# NOTE: this column is recomputed further below from the CHIL1/CHIL2/CHIL3 percentages.
binary_values_(data, "CHILD03_0_1", "CHILD07_0_1", "CHILD12_0_1", "CHILD18_0_1", "NUMCHLDbinary")
data['donor_have_child_y_n'].value_counts()
CHIL1: Percent Children Under Age 7 CHIL2: Percent Children Age 7 - 13 CHIL3: Percent Children Age 14-17
Rule to be applied: • If CHIL1 + CHIL2 + CHIL3 is between 90 and 110: the donor has a child (1) • Else: the donor has no child (0)
# Creating a function that receives a dataframe and 3 percentage columns and adds the
# column 'donor_have_child_y_n': 1 when the three percentages sum to a value between
# 90 and 110 (children present), 0 otherwise.
def binary_values_3(data_frame, column1, column2, column3):
    """Flag rows whose three child-percentage columns sum to a value in [90, 110].

    The result is stored in place in data_frame['donor_have_child_y_n'].
    """
    # Compute the row-wise total once (the original recomputed it twice per row)
    # and vectorize the bounds check.
    total = data_frame[column1] + data_frame[column2] + data_frame[column3]
    data_frame['donor_have_child_y_n'] = ((total >= 90) & (total <= 110)).astype(int)
# Applying the function above on the variables "CHIL1", "CHIL2", "CHIL3".
# NOTE: this overwrites the 'donor_have_child_y_n' column produced by binary_values_ above.
binary_values_3(data, "CHIL1", "CHIL2", "CHIL3")
data['donor_have_child_y_n'].value_counts()
# Translate the binary child flag into a yes/no label.
def child_y_n(v2):
    """Return "no" for 0 and "yes" for any other value."""
    return "no" if v2 == 0 else "yes"
# Applying the function above on the donor_have_child_y_n.
data["donor_have_child_y_n"]= data ["donor_have_child_y_n"].apply(lambda x:child_y_n(x))
data["donor_have_child_y_n"].value_counts()
# Drop the raw child columns and the intermediate binary helpers.
data=data.drop(columns=["CHILD03", "CHILD07", "CHILD12", "CHILD18", "NUMCHLD"])
data=data.drop(columns=["CHILD03_0_1", "CHILD07_0_1", "CHILD12_0_1", "CHILD18_0_1", "NUMCHLDbinary"])
# Variable that stores the urbanicity level taken from the first byte of DOMAIN.
data['urban_city_level'] = [data['DOMAIN'][i][0] for i in range(len(data))]
# Replace each one-letter code with its descriptive name; any unrecognised code
# (including the blank " " placeholder) falls back to 'Suburban', matching the
# original else-branch.
level_names = {'C': 'City', 'S': 'Suburban', 'T': 'Town', 'R': 'Rural', 'U': 'Urban'}
data['urban_city_level'] = [level_names.get(code, 'Suburban') for code in data['urban_city_level']]
data['urban_city_level'].value_counts()
# Socio-economic status code: second byte of DOMAIN; blank DOMAIN values get the
# placeholder 0 for now.
status_codes = []
for row in range(len(data)):
    domain_value = data['DOMAIN'][row]
    status_codes.append(domain_value[1] if domain_value != ' ' else 0)
data['social_economical_status'] = status_codes
# Most frequent status code (inspected as a reference for the imputation below).
modes = data['social_economical_status'].mode().loc[0]
modes
# Updating data['social_economical_status'] with a name instead of the code; anything
# outside codes 1-4 (e.g. the 0 placeholder) defaults to 'Above average SES',
# matching the original else-branch.
ses_names = {'1': 'Highest SES', '2': 'Above average SES',
             '3': 'Below average SES', '4': 'Lowest SES'}
data['social_economical_status'] = [ses_names.get(code, 'Above average SES')
                                    for code in data['social_economical_status']]
data['social_economical_status'].value_counts()
# DOMAIN has now been fully disaggregated and can be dropped.
data=data.drop(columns=['DOMAIN'])
• "DOB”: Date of birth with the following format “YYMM, Year/Month format” - The new variable will be "Age" and will only take in account the donor's year of birth relative to 2020.
• During this stage we found 2593 NaN in DOB. In order to solve this problem, we decided to use the mean age per state. We proceeded this way because we found that wealth varies from state to state, and we will use this variable, "WEALTH2", later on in the project.
# Build the Age variable from DOB using 2020 as the reference year (only the year
# part of DOB is used); donors with a missing DOB temporarily get age 0 so that
# they can be imputed below.
data['Age'] = [0 if dob is np.nan else 2020 - int(dob.split('-')[0])
               for dob in data['DOB']]
# Calculating the mean age per State (only rows with a known, non-negative age).
avg_age_per_state = data.loc[data["Age"]>=0].groupby(["STATE"])["Age"].mean().sort_values(ascending=False)
# Here we are using a for loop to fill all the missing cases (Age == 0) with the
# average age of the donor's state.
# NOTE(review): for Age == 0 rows, one value is appended per *matching* state in the
# inner loop; this relies on every STATE value appearing exactly once in
# avg_age_per_state.index — otherwise F's length would not match the dataframe. Confirm.
F=[]
for i in range(0,len(data['Age'])):
    if data['Age'][i] == 0:
        for j in avg_age_per_state.index:
            if data['STATE'][i] == j:
                # truncate the state mean to an integer age
                a = avg_age_per_state[j].astype(int)
                F.append(a)
    else:
        a = data['Age'][i]
        F.append(a)
data['Age']=F
# Fill the still-unresolved ages (Age == 0) with the neighborhood average age
# (census variable AGE904).
# FIX: the original assigned through chained indexing (data['Age'][i] = ...), which
# triggers SettingWithCopyWarning and can silently fail; a boolean-mask .loc
# assignment is the supported, vectorized equivalent. The original else-branch
# (data['Age'][i] = data['Age'][i]) was a no-op and is dropped.
still_missing = data['Age'] == 0
data.loc[still_missing, 'Age'] = data.loc[still_missing, 'AGE904']
# Grouping Age values into classes: 0-18, 18-24, 25-34, 35-44, 45-54, 55-64,
# 65-74 and >= 75.
data['Age'] = data['Age'].astype(int)
data['Age'].value_counts()
group_labels = []
for age in data['Age']:
    if 0 <= age < 19:
        group_labels.append('Age 0-18')
    elif 19 <= age < 25:
        group_labels.append('Age 18-24')
    elif 25 <= age < 35:
        group_labels.append('Age 25-34')
    elif 35 <= age < 45:
        group_labels.append("Age 35-44")
    elif 45 <= age < 55:
        group_labels.append("Age 45-54")
    elif 55 <= age < 65:
        group_labels.append("Age 55-64")
    elif 65 <= age < 74:
        # NOTE(review): this bucket actually covers 65-73; age 74 falls into
        # 'Age >= 75'. Kept as-is to preserve the original behavior.
        group_labels.append("Age 65-74")
    else:
        group_labels.append('Age >= 75')
data['Group_age'] = group_labels
data['Group_age'].value_counts()
# Inspect the rows whose Age could not be imputed (still 0).
data.loc[data["Age"]==0]
# data=data.loc[~data["CONTROLN"].isin([173966,28195,14976,14973,14974,14971])].reset_index
# Drop those six unresolved rows by index label and rebuild a clean RangeIndex.
data=data.drop([18715,37117,64356,65358,68255,84821],axis=0).reset_index(drop=True)
# .reset_index
# DOB is now fully encoded in Age / Group_age.
data=data.drop(columns=['DOB'])
For that we will use three variables, "WEALTH1", "WEALTH2" and eventually "INCOME".
•WEALTH1 - Wealth Rating
•WEALTH2 - Wealth rating uses median family income and population statistics from each area to index relative wealth
within each state
•First step Compare the missing cases between "WEALTH1" (44732 NaN) and "WEALTH2" (43823 NaN):
# Compare the number of missing cases between WEALTH1 and WEALTH2.
print("Number of missing cases in Wealth 1 is :", data['WEALTH1'].isnull().sum())
print("Number of missing cases in Wealth 2 is :", data['WEALTH2'].isnull().sum())
# Boolean masks used below to merge the two wealth rankings.
NA2=list(data['WEALTH2'].isna()) #list of nans from wealth2
NA1=list(data['WEALTH1'].isna()) #list of nans from wealth1
# aaa=data.loc[(data["WEALTH2"]>=0) & (data["INCOME"]>=0)&(data["STATE"]=="AR")]
• Of all missing cases of WEALTH1 and WEALTH2, 21843 are in common, which leaves us with 43823 - (44732 - 21843) = 20943 possible values to reduce the number of NaN in WEALTH2
• For this reason we might use these (non-NaN) values to complete WEALTH2 (the feature with fewer NaN and also with a more significant ranking)
• Completing WEALTH2 with these values from WEALTH1 would result in a reduction of 21980/43823 = 49% of the NaN present in WEALTH2
• To use this method we need to check whether the information from both rankings is consistent; for that reason, we will compute the median rank for each state according to the WEALTH2 observations and measure the difference between the WEALTH1 observations and the median, as follows:
# Median wealth ranking per state for each of the two features, used to check
# whether the two rankings are mutually consistent.
ranking_per_state_w2=pd.pivot_table(data,columns=['STATE'],values='WEALTH2',aggfunc=np.median)
ranking_per_state_w1=pd.pivot_table(data,columns=['STATE'],values='WEALTH1',aggfunc=np.median)
list(ranking_per_state_w2.values)
list(ranking_per_state_w1.values)
• As we may see, between both columns (WEALTH1 & WEALTH2) there isn't much difference in median values per state. For that reason we will use the values from WEALTH1 to complete some missing cases of WEALTH2.
# Complete WEALTH2 with WEALTH1 wherever only WEALTH2 is missing; rows missing in
# both features stay NaN, and rows where WEALTH2 is present are kept untouched.
merged_wealth = []
for i in range(len(data['WEALTH2'])):
    if NA2[i]:
        merged_wealth.append(data['WEALTH1'][i] if not NA1[i] else np.nan)
    else:
        merged_wealth.append(data['WEALTH2'][i])
data['WEALTH2'] = merged_wealth
print("With this we reduced the number of NaNs from 43823 to 21838")
data['WEALTH2'].isna().value_counts()
With this we reduced the number of NaNs from 43823 to 21838
To solve this we are going to use the median wealth2 per state as follows:
# Median WEALTH2 per state, used to impute the remaining missing values.
median_wealht_pivot = pd.pivot_table(data, values='WEALTH2', columns=['STATE'], aggfunc=(np.median))
list(median_wealht_pivot.values)
# Build a small lookup table: state code -> integer median wealth ranking.
state_codes = median_wealht_pivot.columns
state_medians = [int(median_wealht_pivot[state]) for state in state_codes]
median_wealht_per_state = pd.DataFrame({'ESTADOS': state_codes, 'WEALTH_MEDIAN': state_medians})
h=[]
NA3=list(data['WEALTH2'].isna())  # missing-value mask after the WEALTH1 merge
for i in range(0,len(data['WEALTH2'])):
    if NA3[i]==True:
        # Replace the NaN with the median wealth ranking of the donor's state.
        for j in range(0,len(median_wealht_per_state)):
            if median_wealht_per_state['ESTADOS'][j] == data['STATE'][i]:
                k = (median_wealht_per_state['WEALTH_MEDIAN'][j])
                h.append(k)
        # NOTE(review): 'DC' appears to have no usable state median, so its missing
        # rows get no value from the inner loop; the flag below routes them to the
        # fallback branch (keeping the NaN) so h stays aligned with the dataframe.
        # Confirm 'DC' is the only state absent from median_wealht_per_state.
        ver4= data['STATE'][i]=='DC'
    if NA3[i]==False or ver4==True:
        J=data['WEALTH2'][i]
        h.append(J)
data['WEALTH2']=h
data['WEALTH2'].isna().value_counts()
# Inspect and drop the single row that is still NaN after the state-median imputation.
data.loc[data['WEALTH2'].isnull()]
data=data.drop([5237],axis=0).reset_index(drop=True)
data['WEALTH2'].isna().value_counts()
# INCOME and WEALTH1 are no longer needed once WEALTH2 is complete.
data=data.drop(columns=['INCOME','WEALTH1'])
i. If <span style="color:red"> POP90C4 > POP90C5, the donor will be considered Male </span>;
ii. <span style="color:red"> Otherwise, the donor will be considered Female </span>.
data['GENDER'].value_counts() #counting all the values in the column Gender;
# Blank and ambiguous codes (U, J, C, A) are all treated as missing cases;
# a single replace() call with a list is equivalent to the former chained replaces.
data['GENDER'] = data['GENDER'].replace([' ', 'U', 'J', 'C', 'A'], np.NaN)
data['GENDER'].isna().value_counts() # Counting Missing cases
# First lets fill all the Missing cases with U = Unknown
data["GENDER"].fillna("U", inplace = True)
# Second, resolve as many U cases as possible using the title code (TCODE):
# title codes known to correspond to male / female donors.
male=[1, 4, 6, 9, 13, 14, 18]
female= [2,3, 22, 28, 72, 116, 1002, 4002, 13002, 14002, 17002, 28028, 72002]
resolved = []
for row in range(len(data)):
    gender = data['GENDER'][row]
    if gender == 'U' and data["TCODE"][row] in male:
        resolved.append("M")
    elif gender == 'U' and data["TCODE"][row] in female:
        resolved.append("F")
    else:
        resolved.append(gender)
data['GENDER'] = resolved
• POP90C4: Percent Male;
• POP90C5: Percent Female.
• Rule to be applied:
i. If POP90C4> POP90C5 the donor will be considered Male;
ii. Else the donor will be considered Female.
# Last, resolve the remaining U cases from the census gender shares:
# POP90C4 (percent male) vs POP90C5 (percent female) of the donor's area.
resolved = []
for row in range(len(data)):
    gender = data['GENDER'][row]
    if gender == 'U' and data["POP90C4"][row] > data["POP90C5"][row]:
        resolved.append("M")
    elif gender == 'U' and data["POP90C4"][row] < data["POP90C5"][row]:
        resolved.append("F")
    else:
        resolved.append(gender)
data['GENDER'] = resolved
data['GENDER'].value_counts()
we are going to classify every donor according with following the criteria below:
• A=ACTIVE DONOR: Anyone who made their last donation no more than 12 months before the mailing of the last promotion;
• L=LAPSING DONOR: Anyone who made their last donation between 13 and 24 months before the mailing of the last promotion;
• I=INACTIVE DONOR: Anyone who made their last donation 25+ months before the mailing of the last promotion.
# Parse LASTDATE (year-month, '-'-separated) into the year and month of each
# donor's last gift.
last_gift = [str(value) for value in data['LASTDATE']]
year_last_gift = [entry.split('-')[0] for entry in last_gift]
month_last_gift = [entry.split('-')[1] for entry in last_gift]
# year_last_gift holds the year of the last gift per donor;
# month_last_gift holds the month of the last gift per donor.
data["month_of_last_gift"] = month_last_gift
data["month_of_last_gift"] = data["month_of_last_gift"].astype(int)
# Parse ADATE_2 (date of the last promotion mailing) the same way.
last_mail = [str(value) for value in data['ADATE_2']]
year_last_email = [entry.split('-')[0] for entry in last_mail]
month_last_email = [entry.split('-')[1] for entry in last_mail]
# year_last_email / month_last_email: year and month of the last promotion sent,
# converted to integers for the month arithmetic below.
year_last_email = list(map(int, year_last_email))
month_last_email = list(map(int, month_last_email))
# Difference in months between the last promotion mailing and the donor's last gift.
recency_lastdate_adate2 = []
difference_year2m = []
difference_m = []
for i in range(len(data['ADATE_2'])):
    year_gap_in_months = (int(year_last_email[i]) - int(year_last_gift[i])) * 12
    difference_year2m.append(year_gap_in_months)
    month_gap = int(month_last_email[i]) - int(month_last_gift[i])
    recency_lastdate_adate2.append(year_gap_in_months + month_gap)
# recency_lastdate_adate2 stores the difference in months between last gift and last mailing.
data["months_lastgift"] = recency_lastdate_adate2
# Classify every donor by the recency of the last gift relative to the last promotion:
# <= 12 months -> active; 13-24 -> lapsing; > 24 -> inactive.
donor_types = []
for months in recency_lastdate_adate2:
    if months <= 12:
        donor_types.append('ACTIVE DONOR')
    elif months <= 24:
        donor_types.append('LAPSING DONOR')
    else:
        donor_types.append('INACTIVE DONOR')
data['DONOR_TYPE'] = donor_types
data['DONOR_TYPE'].value_counts()
# Month in which each donor made their largest gift (month part of MAXRDATE).
data["month_largest_gift"] = [str(value).split('-')[1] for value in data['MAXRDATE']]
data["month_largest_gift"] = data["month_largest_gift"].astype(int)
# Month in which each donor made their smallest gift (month part of MINRDATE).
data["month_minimum_gift"] = [str(value).split('-')[1] for value in data['MINRDATE']]
data["month_minimum_gift"] = data["month_minimum_gift"].astype(int)
# Work on a copy so the original dataframe stays untouched.
data1=data.copy()
# Set CONTROLN as index
data1=data1.set_index("CONTROLN")
# Creating a dataset for lapsed donors
datasetlapsed=data1.loc[data1["DONOR_TYPE"]=="LAPSING DONOR"]
# Creating a dataset for active donors
datasetactive=data1.loc[data1["DONOR_TYPE"]=="ACTIVE DONOR"]
# Creating a dataset for inactive donors
datasetinactive=data1.loc[data1["DONOR_TYPE"]=="INACTIVE DONOR"]
# Numeric features considered for the clustering analysis below.
metric_features=["RAMNTALL","MINRAMNT", "MAXRAMNT","IC5", "month_of_last_gift",\
"month_minimum_gift", "month_largest_gift", "WEALTH2", "IC1", \
"IC2","IC3","IC4", "AVGGIFT", "months_lastgift"]
# Prepare figure
fig = plt.figure(figsize=(12, 10))
# Obtain correlation matrix. Round the values to 2 decimal cases. Use the DataFrame corr() and round() method.
corr = np.round(datasetlapsed[metric_features].corr(method="pearson"), decimals=2)
# Build annotation matrix (values above |0.5| will appear annotated in the plot)
mask_annot = np.absolute(corr.values) >= 0.5
# Keep only the strong correlations as text labels; weaker cells get an empty string.
annot = np.where(mask_annot, corr.values, np.full(corr.shape,""))
# Upper-triangle mask so each feature pair appears only once in the heatmap.
matrix = np.triu(datasetlapsed[metric_features].corr())
# Plot heatmap of the correlation matrix
sns.heatmap(data=corr, annot=annot, mask=matrix, cmap=sns.diverging_palette(220, 10, as_cmap=True),
            fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)
# Layout
fig.subplots_adjust(top=0.95)
#fig.suptitle("Correlation Matrix", fontsize=20)
plt.savefig(os.path.join('Correlation_matrix.png'), dpi=300)
plt.show()
# Subset of the metric features kept after removing multicollinear variables
# (selection based on the correlation heatmap above).
metric_features_nmulticolinearete=["RAMNTALL","IC1","AVGGIFT","month_minimum_gift","months_lastgift",
"month_largest_gift", "month_of_last_gift"]#,, "WEALTH2",
# count of missing values
datasetlapsed[metric_features_nmulticolinearete].isna().sum()
# descriptive statistics
datasetlapsed[metric_features_nmulticolinearete].describe(include="all").T # try with all and without all
# Working copy (lapsed donors only) used for the clustering experiments below.
clusterdataset=datasetlapsed[metric_features_nmulticolinearete].copy()
clusterdataset
clusterdataset_minmax = clusterdataset.copy()
clusterdataset_minmax
# Use MinMaxScaler to scale the data to the [0, 1] range (distance-based clustering
# is sensitive to feature scale).
scaler = MinMaxScaler()
clusterdataset_minmax_scaled = scaler.fit_transform(clusterdataset_minmax)
clusterdataset_minmax_scaled
# See what the fit method is doing (notice the trailing underscore):
print("Parameters fitted:\n", scaler.data_min_, "\n", scaler.data_max_)
# Write the scaled values back into the dataframe copy, keeping index and columns.
clusterdataset_minmax[metric_features_nmulticolinearete] = clusterdataset_minmax_scaled
clusterdataset_minmax.head()
# Checking max and min of minmaxed variables
clusterdataset_minmax.describe().round(2)
# All Numeric Variables' Box Plots in one figure
sns.set()
# Prepare figure. Create individual axes where each box plot will be placed
fig, axes = plt.subplots(2, math.ceil(len(metric_features_nmulticolinearete) / 2), figsize=(15, 10))
# Plot data
# Iterate across axes objects and associate each box plot (hint: use the ax argument):
# NOTE(review): the box plots are drawn from the *unscaled* clusterdataset, not the
# min-max scaled copy — confirm that is intended.
for ax, feat in zip(axes.flatten(), metric_features_nmulticolinearete): # Notice the zip() function and flatten() method
    sns.boxplot(x=clusterdataset[feat], ax=ax)
# Layout
# Add a centered title to the figure:
#title = "Numeric Variables' Box Plots"
#plt.suptitle(title)
plt.savefig(os.path.join('numeric_variables_boxplots.png'), dpi=300)
plt.show()
# Adapted from:
# https://towardsdatascience.com/k-means-dbscan-gmm-agglomerative-clustering-mastering-the-popular-models-in-a-segmentation-c891a3818e29
# Estimate a candidate DBSCAN eps for several values of min_samples by locating the
# knee of the sorted k-distance curve.
from kneed import KneeLocator
neighbor_counts = [10, 15, 50, 100, 500]  # [10,15,20,25,30,35,40, 45, 50]
eps = []
for k in neighbor_counts:
    # FIX: the original rebound the name 'neighbors' (the list being iterated) to the
    # fitted model inside the loop; use a distinct name for the fitted estimator.
    nn_model = NearestNeighbors(n_neighbors=k).fit(clusterdataset_minmax)
    distances, indices = nn_model.kneighbors(clusterdataset_minmax)
    # Sorted distance from every point to its k-th nearest neighbour.
    distances = np.sort(distances[:, k - 1], axis=0)
    point_index = np.arange(len(distances))
    knee = KneeLocator(point_index, distances, S=1, curve='convex',
                       direction='increasing', interp_method='polynomial')
    fig = plt.figure(figsize=(5, 5))
    knee.plot_knee()
    plt.xlabel("Points")
    plt.ylabel("Distance")
    eps.append(distances[knee.knee])
    print(k, distances[knee.knee])
plt.savefig(os.path.join('eps.png'), dpi=350)
# dbscan = DBSCAN(eps= 0.19465529677899998, min_samples=10)
# # Then fit the model to your data using the fit method
# model = dbscan.fit(clusterdataset_minmax)
# # Calculate Silhoutte Score
# score = silhouette_score(clusterdataset_minmax, model.labels_, metric='euclidean')
# score
# addaptd from:
# https://towardsdatascience.com/k-means-dbscan-gmm-agglomerative-clustering-mastering-the-popular-models-in-a-segmentation-c891a3818e29
# Silhouette Score for DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.cluster import DBSCAN
def get_dbscan_score(dataframe, eps1, center):
    '''
    Fit DBSCAN and return the average Silhouette Score of the resulting labels.

    INPUT:
    dataframe - the dataset you want to fit DBSCAN to
    eps1 - the eps (neighbourhood radius) value for DBSCAN
    center - the min_samples value for DBSCAN
    OUTPUT:
    score - the Silhouette Score for the DBSCAN labelling
    '''
    # instantiate DBSCAN (the original comments said "kmeans" — this is DBSCAN)
    dbscan = DBSCAN(eps= eps1, min_samples=center)
    # Then fit the model to your data using the fit method
    model = dbscan.fit(dataframe)
    # Calculate Silhouette Score over the fitted labels
    score = silhouette_score(dataframe, model.labels_, metric='euclidean')
    return score
# Candidate min_samples values (10, 20, 30, 40).
np.linspace(10, 40, 4).astype(int)
# Adapted from:
# https://towardsdatascience.com/k-means-dbscan-gmm-agglomerative-clustering-mastering-the-popular-models-in-a-segmentation-c891a3818e29
# Grid search: silhouette score of DBSCAN for every (eps, min_samples) combination.
centers = np.linspace(10, 40, 4).astype(int)
eps11=eps
eps_=[]
for eps1 in eps11:
    scores = []
    min_samples= []
    for center in centers:
        scores.append(get_dbscan_score(clusterdataset_minmax, eps1, center))
        eps_.append(eps1)
        min_samples.append(center)
        # NOTE(review): the score is computed a second time here just for printing.
        print (eps1, center,get_dbscan_score(clusterdataset_minmax, eps1, center) )
    # One score-vs-min_samples curve per eps candidate.
    fig = plt.figure(figsize=(5, 5))
    plt.plot(centers, scores, linestyle='--', marker='o', color='b');
    plt.xlabel('min_samples');
    plt.ylabel('Silhouette Score');
    plt.title('Silhouette Score vs. min_samples');
    plt.savefig(os.path.join('epsminptsshe.png'), dpi=300)
#df3 = pd.DataFrame(centers,columns=['min_samples'])
#df3['eps_'] =eps1
#df3['scores'] = scores
#df4 = df3[df3.scores == df3.scores.max()]
#print('Optimal number of min_samples based on silhouette score:', df4['min_samples'].tolist())
# save the scores and min_samples to csv
# NOTE(review): df3 is built after the loops, so 'eps_' and 'silhouette score' only
# reflect the LAST eps candidate — confirm this is intended.
df3 = pd.DataFrame(centers,columns=['min_samples'])
df3['eps_'] =eps1
df3['silhouette score'] = scores
df3.to_csv(os.path.join("scores and mim_samples.csv"), index=True)
df3
clusterdataset_minmax  # notebook-style display of the scaled dataset

# DBSCAN with the hyperparameters selected in the previous step
dbscan = DBSCAN(eps=0.22481194244797262, min_samples=20, n_jobs=4)
dbscan_labels = dbscan.fit_predict(clusterdataset_minmax)

# Cluster size breakdown — label -1 marks DBSCAN noise points
Counter(dbscan_labels)
Counter(dbscan_labels)
dbscan_labels

# Keep the detected outliers aside (classified later against the final clusters);
# .copy() detaches them from the original frame
df_outliers = clusterdataset_minmax[dbscan_labels == -1].copy()
df_outliers.shape

# Remaining observations with the DBSCAN noise removed
clusterdataset_minmax_no_outliers = clusterdataset_minmax[dbscan_labels != -1].copy()
clusterdataset_minmax_no_outliers.shape
# Elbow method: within-cluster sum of squares (inertia) for k = 1..5
range_clusters = range(1, 6)
inertia = []
for n_clus in range_clusters:  # iterate over desired ncluster range
    kmclust = KMeans(n_clusters=n_clus, init='k-means++', n_init=15, random_state=1)
    kmclust.fit(clusterdataset_minmax_no_outliers)
    inertia.append(kmclust.inertia_)  # save the inertia of the given cluster solution

# The inertia plot
plt.figure(figsize=(9, 5))
# BUG FIX: plot against the actual cluster counts (1..5); the original
# plt.plot(inertia) used the list index 0..4 as the x-axis, shifting the
# elbow one position to the left
plt.plot(range_clusters, inertia)
plt.ylabel("Inertia: SSw")
plt.xlabel("Number of clusters")
plt.title("Inertia plot over clusters", size=15)
plt.show()
# Silhouette analysis: for each k in range_clusters (k >= 2), fit k-means++,
# record the average silhouette, and draw the per-sample silhouette plot.
# NOTE(review): notebook export lost the indentation of the two nested loops
# (`for nclus` and `for i`) — nesting below reflects the sklearn example this
# was adapted from; confirm against the original notebook.
# Adapted from:
# https://scikit-learn.org/stable/auto_examples/cluster/plot_kmeans_silhouette_analysis.html#sphx-glr-auto-examples-cluster-plot-kmeans-silhouette-analysis-py
# Storing average silhouette metric
avg_silhouette = []
for nclus in range_clusters:
# Skip nclus == 1
if nclus == 1:
continue
# Create a figure
fig = plt.figure(figsize=(13, 7))
# Initialize the KMeans object with n_clusters value and a random generator
# seed of 10 for reproducibility.
kmclust = KMeans(n_clusters=nclus, init='k-means++', n_init=15, random_state=1)
cluster_labels = kmclust.fit_predict(clusterdataset_minmax_no_outliers)
# The silhouette_score gives the average value for all the samples.
# This gives a perspective into the density and separation of the formed clusters
silhouette_avg = silhouette_score(clusterdataset_minmax_no_outliers, cluster_labels)
avg_silhouette.append(silhouette_avg)
print(f"For n_clusters = {nclus}, the average silhouette_score is : {silhouette_avg}")
# Compute the silhouette scores for each sample
sample_silhouette_values = silhouette_samples(clusterdataset_minmax_no_outliers, cluster_labels)
y_lower = 10
for i in range(nclus):
# Aggregate the silhouette scores for samples belonging to cluster i, and sort them
ith_cluster_silhouette_values = sample_silhouette_values[cluster_labels == i]
ith_cluster_silhouette_values.sort()
# Get y_upper to demarcate silhouette y range size
size_cluster_i = ith_cluster_silhouette_values.shape[0]
y_upper = y_lower + size_cluster_i
# Filling the silhouette
color = cm.nipy_spectral(float(i) / nclus)
plt.fill_betweenx(np.arange(y_lower, y_upper),
0, ith_cluster_silhouette_values,
facecolor=color, edgecolor=color, alpha=0.7)
# Label the silhouette plots with their cluster numbers at the middle
plt.text(-0.05, y_lower + 0.5 * size_cluster_i, str(i))
# Compute the new y_lower for next plot
y_lower = y_upper + 10 # 10 for the 0 samples
plt.title("The silhouette plot for the various clusters.")
plt.xlabel("The silhouette coefficient values")
plt.ylabel("Cluster label")
# The vertical line for average silhouette score of all the values
plt.axvline(x=silhouette_avg, color="red", linestyle="--")
# The silhouette coefficient can range from -1, 1
xmin, xmax = np.round(sample_silhouette_values.min() -0.1, 2), np.round(sample_silhouette_values.max() + 0.1, 2)
plt.xlim([xmin, xmax])
# The (nclus+1)*10 is for inserting blank space between silhouette
# plots of individual clusters, to demarcate them clearly.
plt.ylim([0, len(clusterdataset_minmax_no_outliers) + (nclus + 1) * 10])
plt.yticks([]) # Clear the yaxis labels / ticks
plt.xticks(np.arange(xmin, xmax, 0.1))
# NOTE(review): savefig runs per iteration with the same name, so only the
# last k's plot survives on disk — presumably intentional; confirm
plt.savefig(os.path.join('kmean++.png'), dpi=300)
# Final cluster solution: k = 3 (chosen from the elbow + silhouette analysis)
number_clusters = 3
kmclust = KMeans(n_clusters=number_clusters, init='k-means++', n_init=15, random_state=1)
km_labels = kmclust.fit_predict(clusterdataset_minmax_no_outliers)
km_labels.shape

# Wrap the labels in a DataFrame aligned with the original observation index
df_labels = pd.DataFrame(km_labels, columns=['labelskmeans++'])
df_labels['index'] = clusterdataset_minmax_no_outliers.index
df_labels.set_index("index", inplace=True)

# Attach the labels to the data and inspect cluster sizes
df_kmeanspp = pd.concat((clusterdataset_minmax_no_outliers, df_labels), axis=1)
df_kmeanspp.groupby('labelskmeans++').count()
# Inter-cluster distance map (yellowbrick)
from yellowbrick.cluster import InterclusterDistance

visualizer = InterclusterDistance(kmclust)  # wrap the fitted clustering model
visualizer.fit(df_kmeanspp)                 # fit the data to the visualizer
visualizer.show()                           # finalize and render the figure
plt.savefig(os.path.join('InterclusterDistance.png'), dpi=300)
def cluster_profiles(df, label_columns, figsize, compar_titles=None):
    """
    Pass df with labels columns of one or multiple clustering labels.
    Then specify this label columns to perform the cluster profile according to them.

    df            - DataFrame containing the features plus the label column(s)
    label_columns - list of column names holding cluster labels
    figsize       - figure size passed to plt.subplots
    compar_titles - optional list of titles, one per label column
    """
    # Idiom fix: compare to None with `is`, not `==`
    if compar_titles is None:
        compar_titles = [""] * len(label_columns)
    sns.set()
    # One row of (parallel-coordinates, bar-chart) axes per label column
    fig, axes = plt.subplots(nrows=len(label_columns), ncols=2, figsize=figsize, squeeze=False)
    for ax, label, titl in zip(axes, label_columns, compar_titles):
        # Filtering df: drop the other label columns so they don't enter the means
        drop_cols = [i for i in label_columns if i != label]
        dfax = df.drop(drop_cols, axis=1)
        # Getting the cluster centroids and counts
        centroids = dfax.groupby(by=label, as_index=False).mean()
        counts = dfax.groupby(by=label, as_index=False).count().iloc[:, [0, 1]]
        counts.columns = [label, "counts"]
        # Setting Data
        pd.plotting.parallel_coordinates(centroids, label, color=sns.color_palette(), ax=ax[0])
        sns.barplot(x=label, y="counts", data=counts, ax=ax[1])
        # Setting Layout
        handles, _ = ax[0].get_legend_handles_labels()
        cluster_labels = ["Cluster {}".format(i) for i in range(len(handles))]
        ax[0].legend(handles, cluster_labels)  # adaptable to number of clusters
        ax[0].axhline(color="black", linestyle="--")
        ax[0].set_title("Cluster Means - {} Clusters".format(len(handles)), fontsize=13)
        ax[0].set_xticklabels(ax[0].get_xticklabels(), rotation=-20)
        ax[1].set_xticklabels(cluster_labels)
        ax[1].set_xlabel("")
        ax[1].set_ylabel("Absolute Frequency")
        ax[1].set_title("Cluster Sizes - {} Clusters".format(len(handles)), fontsize=13)
    plt.subplots_adjust(hspace=0.4, top=0.90)
    plt.suptitle("Cluster Simple Profilling", fontsize=23)
    # BUG FIX: save BEFORE show() — plt.show() flushes the current figure, so
    # saving afterwards (as the original did) writes a blank image
    plt.savefig(os.path.join('cluster_profiles.png'), dpi=300)
    plt.show()
# Profile the final k-means++ clusters (means per feature + cluster sizes)
cluster_profiles(
    df_kmeanspp,
    label_columns=['labelskmeans++'],
    figsize=(28, 13),
    compar_titles=["cluster"],
)
# t-SNE projection to 2D — this step can be quite time consuming
two_dim = TSNE(random_state=42).fit_transform(df_kmeanspp)

# Scatter the embedding, coloured by cluster label
pd.DataFrame(two_dim).plot.scatter(x=0, y=1, c=df_kmeanspp['labelskmeans++'], colormap='tab10', figsize=(15, 10))
plt.savefig(os.path.join('t-sne.png'), dpi=300)
plt.show()
# This SOM implementation has no random-seed parameter — seed numpy ourselves
np.random.seed(42)
sm = sompy.SOMFactory().build(
    clusterdataset_minmax_no_outliers.values,
    mapsize=(5, 5),
    initialization='random',
    neighborhood='gaussian',
    training='batch',
    lattice='hexa',
    component_names=metric_features_nmulticolinearete
)
sm.train(n_job=4, verbose='info', train_rough_len=100, train_finetune_len=100)

# Component planes: per-feature values of each SOM unit
sns.set()
view2D = View2D(12, 12, "", text_size=10)
view2D.show(sm, col_sz=3, what='codebook')
plt.subplots_adjust(top=0.90)
plt.suptitle("Component Planes", fontsize=20)
plt.show()

# U-matrix of the 5x5 grid
u = sompy.umatrix.UMatrixView(12, 12, 'umatrix', show_axis=True, text_size=8, show_text=True)
UMAT = u.show(
    sm,
    distance2=1,
    row_normalized=False,
    show_data=True,
    contooor=True,  # visualize isomorphic curves
    blob=False
)
UMAT[1]  # U-matrix values — no colorscale, so rely on the printed values

# Hits map: number of observations mapped to each unit
vhts = BmuHitsView(12,12,"Hits Map")
vhts.show(sm, anotate=True, onlyzeros=False, labelsize=12, cmap="Blues")
plt.show()
# Seed numpy again — the SOM library has no seed parameter of its own
np.random.seed(42)
# Note: this SOM did not converge (the original class ran under a time constraint)
sm = sompy.SOMFactory().build(
    clusterdataset_minmax_no_outliers.values,
    mapsize=(50, 50),
    initialization='random',
    neighborhood='gaussian',
    training='batch',
    lattice='hexa',
    component_names=metric_features_nmulticolinearete
)
sm.train(n_job=-1, verbose='info', train_rough_len=300, train_finetune_len=300)

# Coordinates of the units in the input space
sm.get_node_vectors()

# Component planes on the 50x50 grid
sns.set()
view2D = View2D(40, 25, "", text_size=20)
view2D.show(sm, col_sz=3, what='codebook')
plt.subplots_adjust(top=0.90)
plt.suptitle("Component Planes", fontsize=20)
plt.savefig(os.path.join('Component Planes.png'), dpi=300)
plt.show()
# U-matrix of the 50x50 grid: inter-unit distances reveal cluster boundaries
u = sompy.umatrix.UMatrixView(12, 12, 'umatrix', show_axis=True, text_size=8, show_text=True)
UMAT = u.show(
    sm,
    distance2=1,
    row_normalized=False,
    show_data=True,
    contooor=True  # visualize isomorphic curves
)
plt.savefig(os.path.join('umatrix.png'), dpi=300)
# df_ = pd.DataFrame(sm.get_node_vectors())
# df_
# K-Means on top of the 2500 SOM units (the sm.get_node_vectors() output)
kmeans = KMeans(n_clusters=3, init='k-means++', n_init=20, random_state=42)
nodeclus_labels = sm.cluster(kmeans)
hits = HitMapView(12, 12, "Clustering", text_size=10)
hits.show(sm, anotate=True, onlyzeros=False, labelsize=7, cmap="Pastel1")
plt.show()

# Inspect the units together with their assigned clusters
nodes = sm.get_node_vectors()
df_nodes = pd.DataFrame(nodes, columns=metric_features_nmulticolinearete)
df_nodes['label'] = nodeclus_labels
df_nodes['label'].unique()
# Best matching unit (BMU) of each observation on the trained SOM
bmus_map = sm.find_bmu(clusterdataset_minmax_no_outliers)[0]
df_bmus = pd.DataFrame(
    np.concatenate((clusterdataset_minmax_no_outliers, np.expand_dims(bmus_map, 1)), axis=1),
    index=clusterdataset_minmax_no_outliers.index,
    columns=np.append(clusterdataset_minmax_no_outliers.columns, "BMU")
)
df_bmus

# Propagate each unit's cluster label down to its observations via the BMU
df_final = df_bmus.merge(df_nodes['label'], how='left', left_on="BMU", right_index=True)
df_final['label'].unique()
# Categorical features used for demographic profiling
non_metric_features = [
    "donor_resp_maill_offer", "donor_interest_collecting", "donor_have_child_y_n",
    "urban_city_level", "social_economical_status", 'GENDER', 'Group_age',
]
# Metric features used to describe the clusters
metric_features_describe_cluster = ["AVGGIFT", "NGIFTALL", "IC1", "MINRAMNT", "MAXRAMNT", "months_lastgift", "ETH14"]

# Copy the categorical subset and one-hot encode it
datalapsed_caterorical = datasetlapsed[non_metric_features].copy()
datalapsed_caterorical = pd.get_dummies(datalapsed_caterorical, non_metric_features)
datalapsed_caterorical

# Join the k-means++ labels and sum the dummy counts per cluster
datalapsed_caterorical = pd.concat((datalapsed_caterorical, df_labels), axis=1)
demographic_analises = datalapsed_caterorical.groupby('labelskmeans++').sum()
demographic_analises.columns
# Column groups for each demographic breakdown of the clusters
urban = ['urban_city_level_City', 'urban_city_level_Rural', 'urban_city_level_Suburban',
         'urban_city_level_Town', 'urban_city_level_Urban']
social_economical_status = ['social_economical_status_Above average SES', 'social_economical_status_Below average SES',
                            'social_economical_status_Highest SES', 'social_economical_status_Lowest SES']
age = ['Group_age_Age 0-18', 'Group_age_Age 18-24', 'Group_age_Age 25-34', 'Group_age_Age 35-44',
       'Group_age_Age 45-54', 'Group_age_Age 55-64', 'Group_age_Age 65-74', 'Group_age_Age >= 75']
resp = ['donor_resp_maill_offer_None', 'donor_resp_maill_offer_both',
        'donor_resp_maill_offer_hit', 'donor_resp_maill_offer_other types']
interest = ['donor_interest_collecting_no', 'donor_interest_collecting_yes']
sex = ['GENDER_F', 'GENDER_M']

# Persist each per-cluster breakdown to its own CSV (same filenames as before)
for csv_name, cols in [("urban.csv", urban),
                       ("social.csv", social_economical_status),
                       ("age.csv", age),
                       ("resp.csv", resp),
                       ("interest.csv", interest),
                       ("sex.csv", sex)]:
    demographic_analises[cols]  # notebook-style display of the table
    demographic_analises[cols].to_csv(os.path.join(csv_name), index=True)
# Metric subset of the lapsed donors, joined with the cluster labels
datalapsed_metric_cluster = datasetlapsed[metric_features_describe_cluster].copy()
datalapsed_metric_cluster
datalapsed_metric_cluster = pd.concat((datalapsed_metric_cluster, df_labels), axis=1)

# Mean of each metric feature per final cluster
Cluster = datalapsed_metric_cluster.groupby('labelskmeans++').mean()
Cluster
Cluster.to_csv(os.path.join("Cluster.csv"), index=True)

# Features used to profile the active donors
metric_features_describe_clusterActive = ["AVGGIFT", "NGIFTALL", "IC1", "MINRAMNT", "MAXRAMNT",
                                          "months_lastgift", "DONOR_TYPE", "ETH14"]

# Mean values per active-donor type (exported for kaggle)
AD = datasetactive[metric_features_describe_clusterActive].groupby('DONOR_TYPE').mean()
AD
AD.to_csv(os.path.join("AD.csv"), index=True)

# Mean values per inactive-donor type (exported for kaggle)
ID = datasetinactive[metric_features_describe_clusterActive].groupby('DONOR_TYPE').mean()
ID.to_csv(os.path.join("ID.csv"), index=True)
ID
### Using the R²
# What proportion of each variable's total SS is explained between clusters?
def get_ss_variables(df):
    """Return the total sum of squares (SS) of every column of *df*.

    Uses the identity SS = variance * (n - 1), computed column-wise.
    """
    return df.var() * (df.count() - 1)


def r2_variables(df, labels):
    """Return the per-variable R²: the fraction of each column's total SS
    that is explained between the groups defined by *labels*.

    labels - column name (or grouper) splitting *df* into clusters
    """
    sst_vars = get_ss_variables(df)
    # Within-group SS: total SS computed inside each cluster, then summed
    ssw_vars = np.sum(df.groupby(labels).apply(get_ss_variables))
    return 1 - ssw_vars / sst_vars
# We are essentially decomposing the R² into the R² for each variable
# NOTE(review): `df`, `metric_features` and the 'merged_labels' column are not
# defined in this chunk — presumably built earlier in the notebook; confirm
r2_variables(df[metric_features.to_list() + ['merged_labels']], 'merged_labels').drop('merged_labels')
### Using a Decision Tree
# We get the normalized total reduction of the criterion (gini or entropy) brought by that feature (also known as Gini importance).
# Assemble the design matrix and target from the labelled data
X = df.drop(columns=['product_labels', 'behavior_labels', 'merged_labels'])
y = df.merged_labels

# Hold out 20% for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Shallow decision tree: an interpretable surrogate for the cluster labels
dt = DecisionTreeClassifier(random_state=42, max_depth=3)
dt.fit(X_train, y_train)
print("It is estimated that in average, we are able to predict {0:.2f}% of the customers correctly".format(dt.score(X_test, y_test)*100))

# Gini-based feature importances
pd.Series(dt.feature_importances_, index=X_train.columns)

# Classify the previously removed outliers into the final clusters
df_out['merged_labels'] = dt.predict(df_out.drop(columns=['rcn']))
df_out.head()
# Render the fitted tree with graphviz
dot_data = export_graphviz(
    dt,
    out_file=None,
    feature_names=X.columns.to_list(),
    filled=True,
    rounded=True,
    special_characters=True,
)
graphviz.Source(dot_data)
# Features selected for the marketing approach ("RFA_2F", "RFA_2A" were considered but dropped)
marketing_aproch = ["month_largest_gift", "month_minimum_gift", "MINRAMNT", "MAXRAMNT", "ETH14"]

# Subset the lapsed-donor data and attach the k-means++ labels
datalapsed_marketing_aproch = datasetlapsed[marketing_aproch].copy()
datalapsed_marketing_aproch = pd.concat((datasetlapsed[marketing_aproch], df_labels), axis=1)
datalapsed_marketing_aproch
# Approach 1: which clusters give the largest gifts, and in which month
# Adapted from:
# https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/#26.-Box-Plot
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import ConvexHull

df = datalapsed_marketing_aproch  # data under analysis

# Scatter the first two columns, coloured by cluster
plt.figure(figsize=(14, 10), dpi=300)
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=datalapsed_marketing_aproch["labelskmeans++"], cmap='tab10')


def encircle(x, y, ax=None, **kw):
    # Draw the convex hull of the (x, y) points as a filled polygon
    if not ax:
        ax = plt.gca()
    p = np.c_[x, y]
    hull = ConvexHull(p)
    poly = plt.Polygon(p[hull.vertices, :], **kw)
    ax.add_patch(poly)


# One hull per cluster: month of largest gift vs. amount of largest gift
encircle(df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 0, 'month_largest_gift'], df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 0, 'MAXRAMNT'], ec="k", fc="gold", alpha=0.2, linewidth=0, label="cluster 0")
encircle(df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 1, 'month_largest_gift'], df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 1, 'MAXRAMNT'], ec="k", fc="tab:blue", alpha=0.2, linewidth=0, label="cluster 1")
encircle(df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 2, 'month_largest_gift'], df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 2, 'MAXRAMNT'], ec="k", fc="tab:red", alpha=0.2, linewidth=0, label="cluster 2")

# Decorations
plt.legend()
plt.xlabel('Month associated with the largest gift'); plt.xticks(fontsize=12)
plt.ylabel('Dollar amount of largest gift'); plt.yticks(fontsize=12)
plt.savefig(os.path.join('month_largest_vs_MAXRANT.jpg'))
plt.show()
# Approach 2: which clusters give the smallest gifts, and in which month
# Adapted from:
# https://www.machinelearningplus.com/plots/top-50-matplotlib-visualizations-the-master-plots-python/#26.-Box-Plot
from sklearn.cluster import AgglomerativeClustering
from scipy.spatial import ConvexHull

df = datalapsed_marketing_aproch  # data under analysis

# Scatter the first two columns, coloured by cluster
plt.figure(figsize=(14, 10), dpi=300)
plt.scatter(df.iloc[:, 0], df.iloc[:, 1], c=datalapsed_marketing_aproch["labelskmeans++"], cmap='tab10')


def encircle(x, y, ax=None, **kw):
    # Draw the convex hull of the (x, y) points as a filled polygon
    if not ax:
        ax = plt.gca()
    p = np.c_[x, y]
    hull = ConvexHull(p)
    poly = plt.Polygon(p[hull.vertices, :], **kw)
    ax.add_patch(poly)


# One hull per cluster: month of smallest gift vs. amount of smallest gift
encircle(df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 0, 'month_minimum_gift'], df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 0, 'MINRAMNT'], ec="k", fc="gold", alpha=0.2, linewidth=0, label="cluster 0")
encircle(df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 1, 'month_minimum_gift'], df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 1, 'MINRAMNT'], ec="k", fc="tab:blue", alpha=0.2, linewidth=0, label="cluster 1")
encircle(df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 2, 'month_minimum_gift'], df.loc[datalapsed_marketing_aproch["labelskmeans++"] == 2, 'MINRAMNT'], ec="k", fc="tab:red", alpha=0.2, linewidth=0, label="cluster 2")

# Decorations
plt.legend()
plt.xlabel('Month associated with the smallest gift '); plt.xticks(fontsize=12)
plt.ylabel('Dollar amount of smallest gift'); plt.yticks(fontsize=12)
plt.savefig(os.path.join('month_smallest_giftvs_MINRAMNT.jpg'))
plt.show()
# Approach 3: Puerto rican population per cluster (mean of the ETH14 feature)
# BUG FIX: the original assigned to `Pouertto rican`, which contains a space and
# is not a valid Python identifier (SyntaxError); renamed to a valid name.
pouertto_rican = datalapsed_marketing_aproch.groupby(["labelskmeans++"]).mean()["ETH14"]
# save to csv for kaggle (output filename kept identical to the original)
datalapsed_marketing_aproch.groupby(["labelskmeans++"]).mean()["ETH14"].to_csv(os.path.join("Pouertto rican.csv"), index=True)
# BUG FIX: `sm.cluster??` is IPython introspection syntax, not valid Python —
# it is a SyntaxError when the exported notebook runs as a script. Kept here
# as a comment; in IPython/Jupyter use `sm.cluster??` to view the source.
# sm.cluster??